
## L2 cluster summaries

mod_vf = function(vf){
  vf %>%
    transmute(
      LALVOTERID,
      tract, block_group, block,
      education = case_when(is.na(CommercialData_Education) ~ 'Unkn', T ~ str_sub(CommercialData_Education, 1, 4)),
      home_val = str_replace_all(CommercialData_EstHomeValue, '^\\$', '') %>% as.numeric(),
      income_est = CommercialData_EstimatedHHIncome,
      home_purchase_date = as.Date(CommercialData_HomePurchaseDate, format = '%m/%d/%Y'),
      home_purchase_price = str_replace_all(CommercialData_HomePurchasePrice, '^\\$', '') %>% as.numeric(),
      land_value = str_replace_all(CommercialData_LandValue, '^\\$', '') %>% as.numeric(),
      occupation = CommercialData_OccupationIndustry,
      property_type = CommercialData_PropertyType,
      owner_renter = CommercialDataLL_Home_Owner_Or_Renter,
      net_worth = CommercialDataLL_HH_Net_Worth,
      cong_dist = `US_Congressional_District`,
      gender = Voters_Gender,
      age20 = 2020-year(as.Date(Voters_BirthDate, format = '%m/%d/%Y')),
      party = Parties_Description,
      ethnicity = EthnicGroups_EthnicGroup1Desc,
      state_file,
      gender_l2 = gender,
      fips_l2 = Voters_FIPS,
      lat = Residence_Addresses_Latitude,
      long = Residence_Addresses_Longitude
    ) %>%
    mutate(
      income_est = case_when(
        income_est == '$150000-174999' ~ '150',
        income_est == '$1000-14999' ~ '001',
        income_est == '$15000-24999' ~ '015',
        income_est == '$50000-74999' ~ '050',
        income_est == '$35000-49999' ~ '035',
        income_est == '$75000-99999' ~ '075',
        income_est == '$100000-124999' ~ '100',
        income_est == '$25000-34999' ~ '025',
        income_est == '$125000-149999' ~ '125',
        income_est == '$200000-249999' ~ '200',
        income_est == '$175000-199999' ~ '175',
        income_est == '$250000+' ~ '250',
        T ~ NA_character_
      ),
      net_worth = case_when(
        net_worth == '$100000-249999' ~ '100',
        net_worth == '$250000-499999' ~ '250',
        net_worth == '$25000-49999' ~ '025',
        net_worth == '$50000-99999' ~ '050',
        net_worth == '$499999+' ~ '500',
        net_worth == '$1-4999' ~ '001',
        net_worth == '$10000-24999' ~ '010',
        net_worth == '$5000-9999' ~ '005',
        T ~ NA_character_
      ),
      party = case_when(
        party == 'Democratic' ~ 'D',
        party == 'Republican' ~ 'R',
        party == 'Non-Partisan' | party == 'Declined to State' | party == 'Registered Independent' ~ 'I',
        !is.na(party) ~ 'O',
        T ~ NA_character_),
      ethnicity = case_when(
        ethnicity == 'European' ~ 'W',
        ethnicity == 'Hispanic and Portuguese' ~ 'H',
        ethnicity == 'East and South Asian' ~ 'A',
        ethnicity == 'Likely African-American' ~ 'B',
        is.na(ethnicity) ~ NA_character_,
        T ~ 'O')
    )
}

l2_clusters <- open_dataset('data/all_components') %>%
  filter(str_starts(id, 'L')) %>%
  collect()

if(! 'data.table' %in% class(l2_clusters)) l2_clusters <- as.data.table(l2_clusters)

setDT(l2_clusters, key = 'id')

# find most recent registrant among clusters with > 1 L2 rows
multi <- l2_clusters %>% arrow_table() %>% group_by(component) %>% tally() %>% filter(n>1) %>% pull(component)

multi <- l2_clusters %>% arrow_table() %>% filter(component %in% multi) %>% pull(id)

multi <- rbindlist(
  list(
    open_dataset('data/l2/year=2014') %>%
      transmute(LALVOTERID, reg_date = as.Date(Voters_CalculatedRegDate, '%m/%d/%Y')) %>%
      filter(LALVOTERID %in% multi) %>%
      as.data.table(),
    open_dataset('data/l2/year=2016') %>%
      transmute(LALVOTERID, reg_date = as.Date(Voters_CalculatedRegDate, '%m/%d/%Y')) %>%
      filter(LALVOTERID %in% multi) %>%
      as.data.table(),
    open_dataset('data/l2/year=2020') %>%
      transmute(LALVOTERID, reg_date = as.Date(Voters_CalculatedRegDate, '%m/%d/%Y')) %>%
      filter(LALVOTERID %in% multi) %>%
      as.data.table()
  )
)

multi <- unique(multi)

multi <- merge(multi, l2_clusters[id %chin% multi$LALVOTERID], all.x = T, all.y = F, by.x = 'LALVOTERID', by.y = 'id')

setkey(multi,component)

setorder(multi, component, -reg_date)

# pull most recent, then anti-join; so, retain NOT the most recent
multi <- multi[!multi[,.SD[1],component], on = c('component', 'LALVOTERID')][, reg_date := NULL]

setnames(l2_clusters, 'id', 'LALVOTERID')

# remove these 'not the most recent' rows
l2_clusters <- l2_clusters[!multi, on = c('component', 'LALVOTERID')] # anti-join

## 2012 data
out <- open_dataset('data/l2/year=2014/') %>%
  filter(as.Date(Voters_CalculatedRegDate, '%m/%d/%Y') <= as.Date('11/06/2012', '%m/%d/%Y')) %>%
  mod_vf() %>%
  collect()

if(! 'data.table' %in% class(out)) out <- as.data.table(out)

setDT(out, key = 'LALVOTERID')

out <- unique(out, by = 'LALVOTERID')
out <- l2_clusters[out, on = 'LALVOTERID']
out[, component := fifelse(is.na(component), LALVOTERID, as.character(component))]

write_dataset(out, 'data/l2_summ_2012', max_rows_per_file = 1e6)

## 2016 data
out <- open_dataset('data/l2/year=2016/') %>%
  filter(as.Date(Voters_CalculatedRegDate, '%m/%d/%Y') <= as.Date('11/08/2016', '%m/%d/%Y')) %>%
  mod_vf() %>%
  collect()

if(! 'data.table' %in% class(out)) out <- as.data.table(out)

setDT(out, key = 'LALVOTERID')

out <- unique(out, by = 'LALVOTERID')
out <- l2_clusters[out, on = 'LALVOTERID']
out[, component := fifelse(is.na(component), LALVOTERID, as.character(component))]

write_dataset(out, 'data/l2_summ_2016', max_rows_per_file = 1e6)

## 2020 data
out <- open_dataset('data/l2/year=2020/') %>%
  filter(as.Date(Voters_CalculatedRegDate, '%m/%d/%Y') <= as.Date('11/03/2020', '%m/%d/%Y')) %>%
  mod_vf() %>%
  collect()

if(! 'data.table' %in% class(out)) out <- as.data.frame(out)

setDT(out, key = 'LALVOTERID')

out <- unique(out, by = 'LALVOTERID')
out <- l2_clusters[out, on = 'LALVOTERID']
out[, component := fifelse(is.na(component), LALVOTERID, as.character(component))]

write_dataset(out, 'data/l2_summ_2020', max_rows_per_file = 1e6)
